import re
import scrapy
import datetime
from books.pipelines import MySQLPipeline
from books.items import AuthorItem, BookInfosItem, BookDetailsItem

class XbqgSpiderSpider(scrapy.Spider):
    """Spider for www.xbiquge.la.

    Crawl flow:
        parse (homepage nav) -> get_book_cate_pages (category page count)
        -> get_book_cate_list_data (book list pages)
        -> get_book_detail_info_data (book detail page)
        -> get_book_detail_content_data (chapter content pages).

    Items (AuthorItem, BookInfosItem, BookDetailsItem) are persisted by the
    pipeline; the spider also queries MySQLPipeline directly to decide whether
    a book needs a full download or only an incremental chapter update.
    """
    name = 'xbqg_spider'
    allowed_domains = ['xbiquge.la']
    start_urls = ['https://www.xbiquge.la/']

    # Category pinyin -> numeric id used in /fenlei/<id>_<page>.html URLs.
    cate_name_data = {'xuanhuan': 1, 'xiuzhen': 2, 'dushi': 3, 'chuanyue': 4, 'wangyou': 5, 'kehuan': 6}
    # Category name -> DB category id; populated from the database in parse().
    book_cate_data = {}

    # Collect the link of each novel category from the homepage nav bar.
    def parse(self, response):
        db = MySQLPipeline()
        # Cache category rows (id, name) from the DB as {name: id}.
        for row in db.get_book_cate_data():
            self.book_cate_data[row[1]] = row[0]

        lis = response.xpath('//div[@class="nav"]/ul/li')
        # The first two and last two nav entries are not category links.
        for li in lis[2:-2]:
            cate_name = li.xpath('./a/text()').extract_first() or ''
            # BUG FIX: str.strip('小说') removes ANY of those characters from
            # both ends of the string; slice off the literal '小说' suffix.
            if cate_name.endswith('小说'):
                cate_name = cate_name[:-len('小说')]
            cate_url = li.xpath('./a/@href').extract_first()
            if not cate_url.startswith('http'):
                cate_url = 'https://www.xbiquge.la' + cate_url
            print(cate_name, cate_url)
            yield scrapy.Request(url=cate_url, callback=self.get_book_cate_pages)

    # Determine how many list pages a category has and request each page.
    def get_book_cate_pages(self, response):  # fixed parameter typo 'resposne'
        page_num = response.xpath('//div[@id="pagelink"]/a[last()]/text()').extract_first()
        cate_pinyin = response.url.split('/')[-2].replace('xiaoshuo', '')
        print('————————————————————————————' + cate_pinyin + '一共有{}页——————————————————————————————'.format(page_num))

        # BUG FIX: the original tested `cate_pinyin in response.url`, which is
        # always true (the pinyin comes from the URL), and could then raise
        # KeyError/TypeError for an unknown category or missing page counter.
        # Skip such pages instead of crashing.
        if page_num and cate_pinyin in self.cate_name_data:
            cate_num = self.cate_name_data[cate_pinyin]
            for pn in range(1, int(page_num) + 1):
                detail_url = 'https://www.xbiquge.la/fenlei/{a}_{b}.html'.format(a=cate_num, b=pn)
                yield scrapy.Request(url=detail_url, callback=self.get_book_cate_list_data)

    # Walk one category list page and decide, per book, whether to download it
    # from scratch (meta update_type=0) or update it only (update_type=1).
    def get_book_cate_list_data(self, response):
        db = MySQLPipeline()

        lis = response.xpath('//div[@id="newscontent"]/div[@class="l"]/ul/li')
        for li in lis:
            book_name = li.xpath('./span[@class="s2"]/a/text()').extract_first()         # book title
            author_name = li.xpath('./span[@class="s5"]/text()').extract_first()
            author_name = author_name.strip() if author_name else ''
            book_url = li.xpath('./span[@class="s2"]/a/@href').extract_first()           # detail page URL
            new_chapter_name = li.xpath('./span[@class="s3"]/a/text()').extract_first()  # latest chapter title

            # Create the author row first if it does not exist yet.
            author_result = db.get_exists_book_author(author_name)
            if not author_result:
                yield AuthorItem(author_name=author_name)

            # Is this book (site id '1') already stored for this author?
            book_result = db.get_exists_book_name(book_name, '1', author_name)
            if book_result:
                if book_result[0] == new_chapter_name:
                    # Stored latest chapter matches the page: nothing to do.
                    print('该书籍- {} -已是最新章节数据！！！！'.format(book_name))
                else:
                    yield scrapy.Request(url=book_url, callback=self.get_book_detail_info_data, meta={'update_type': 1})
            else:
                # Unknown book: full download.
                yield scrapy.Request(url=book_url, callback=self.get_book_detail_info_data, meta={'update_type': 0})

    # Parse one book's detail page: either emit a BookInfosItem plus requests
    # for every chapter (download), or refresh the stored book info and
    # request only the missing chapters (incremental update).
    def get_book_detail_info_data(self, response):
        db = MySQLPipeline()
        # Timestamp used as the record creation time.
        news_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        update_type = response.meta['update_type']  # 0 = download, 1 = update
        book_id = response.url.split('/')[-2]       # book id from the URL path

        # All chapter <dd> entries on the detail page.
        dds = response.xpath('//div[@id="list"]/dl/dd')

        # Last-updated timestamp shown on the page (guard against None).
        update_time = (response.xpath('//div[@id="info"]/p[3]/text()').extract_first() or '').replace('最后更新：', '')
        # Latest chapter title and id.
        newest_name = response.xpath('//div[@id="info"]/p[4]/a/text()').extract_first() or ''
        newest_href = response.xpath('//div[@id="info"]/p[4]/a/@href').extract_first()
        newest_id = newest_href.split('/')[-1].replace('.html', '') if newest_href else ''

        author_name = response.xpath('//div[@id="info"]/p[1]/text()').extract_first() or ''
        # BUG FIX: the original r'[\t\s\n作者：]' is a character class, so it
        # also deleted 作/者/： anywhere INSIDE the author's name. Remove
        # whitespace and the literal '作者：' label only.
        author_name = re.sub(r'\s', '', author_name).replace('作者：', '')
        book_name = response.xpath('//div[@id="info"]/h1/text()').extract_first()  # book title

        # Resolve the author's DB id (the AuthorItem was emitted earlier).
        author_result = db.get_exists_book_author(author_name)
        author_id = author_result[0] if author_result else ''

        if update_type == 0:
            # -------- full download --------
            book_cate = (response.xpath('//div[@class="con_top"]/a[2]/text()').extract_first() or '').replace('小说', '')
            cate_id = self.book_cate_data[book_cate]  # DB category id

            book_desc = response.xpath('//div[@id="intro"]/p[2]/text()').extract_first()     # synopsis
            book_img_url = response.xpath('//div[@id="fmimg"]/img/@src').extract_first()     # cover image URL

            yield BookInfosItem(bid=str(book_id), cid=str(cate_id), aid=str(author_id), did=str(newest_id), sid='1',
                                book_name=book_name, created_time=news_time, update_time=update_time,
                                newest_name=newest_name, book_desc=book_desc, img_url=book_img_url,
                                detail_url=response.url)

            for dd in dds:
                book_detail_title = dd.xpath('./a/text()').extract_first()
                book_detail_url = 'https://www.xbiquge.la' + dd.xpath('./a/@href').extract_first()
                yield scrapy.Request(url=book_detail_url, callback=self.get_book_detail_content_data,
                                     meta={'detail_title': book_detail_title})
        else:
            # -------- incremental update --------
            # Refresh the stored book info first.
            db.change_appoint_book_infos(newest_id, update_time, newest_name, book_id, '1', author_id)

            results = db.get_appoit_book_detail_id(book_id, '1')  # all stored chapter ids for this book
            stored_ids = [str(r[0]) for r in results]

            diff_code = len(dds) - len(stored_ids)
            if diff_code == 1:
                # Exactly one new chapter: fetch just the newest one.
                book_detail_url = response.url + newest_id + '.html'
                yield scrapy.Request(url=book_detail_url, callback=self.get_book_detail_content_data,
                                     meta={'detail_title': newest_name})
            elif diff_code > 1:
                # Several chapters missing: diff page ids against stored ids.
                book_detail_list = []  # (chapter title, chapter id) tuples from the page
                for dd in dds:
                    book_detail_title = dd.xpath('./a/text()').extract_first()
                    detail_url = dd.xpath('./a/@href').extract_first()
                    book_detail_id = detail_url.split('/')[-1].replace('.html', '')
                    book_detail_list.append((book_detail_title, book_detail_id))

                # Symmetric difference = chapter ids present on only one side;
                # pairing back against book_detail_list keeps only ids that
                # actually exist on the page, with their titles, e.g.
                # [('第393章 休想', '38210712'), ...]
                difference_data = list(set(t[1] for t in book_detail_list) ^ set(stored_ids))
                final_data = [t for i in difference_data for t in book_detail_list if i == t[1]]
                for title, chapter_id in final_data:
                    book_detail_url = response.url + chapter_id + '.html'
                    yield scrapy.Request(url=book_detail_url, callback=self.get_book_detail_content_data,
                                         meta={'detail_title': title})
            elif diff_code == 0:
                print('书籍 {} 无需更新！！！！'.format(book_name))

    # Parse one chapter page and emit its content as a BookDetailsItem.
    def get_book_detail_content_data(self, response):
        detail_title = response.meta['detail_title']

        book_url_list = response.url.split('/')
        book_id = book_url_list[-2]                          # book id
        detail_id = book_url_list[-1].replace('.html', '')   # chapter id

        book_name = response.xpath('//div[@class="con_top"]/a[3]/text()').extract_first()

        # BUG FIX: the original str(re.findall(...)) stored the *repr* of a
        # list, so newlines/quotes inside the chapter text were saved as
        # literal escape sequences, and '.' without re.S cannot match across
        # newlines. Extract the raw group via re.search with re.S instead.
        match = re.search(r'<div id="content">(.*?)<p><a href=', response.text, re.S)
        detail_contents = match.group(1) if match else ''

        yield BookDetailsItem(bid=book_id, did=detail_id, sid='1',
                              detail_title=detail_title, detail_contents=detail_contents)
        print('{a} - {b} 下载完成！！！！！'.format(a=book_name, b=detail_title))
